In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1699]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected PFE
In [1700]:
if(w.value == 'AAPL'):
  df = pd.read_csv('/content/Final_AAPL.csv')
if(w.value == 'ABUS'):
  df = pd.read_csv('/content/Final_ABUS.csv')
if(w.value == 'ARDS'):
  df = pd.read_csv('/content/Final_ARDS.csv')
if(w.value == 'BABA'):
  df = pd.read_csv('/content/Final_BABA.csv')
if(w.value == 'BFRI'):
  df = pd.read_csv('/content/Final_BFRI.csv')
if(w.value == 'FB'):
  df = pd.read_csv('/content/Final_FB.csv')
if(w.value == 'GME'):
  df = pd.read_csv('/content/Final_GME.csv')
if(w.value == 'MCD'):
  df = pd.read_csv('/content/Final_MCD.csv')
if(w.value == 'PFE'):
  df = pd.read_csv('/content/Final_PFE.csv')
if(w.value == 'PLUG'):
  df = pd.read_csv('/content/Final_PLUG.csv')
if(w.value == 'QCOM'):
  df = pd.read_csv('/content/Final_QCOM.csv')
if(w.value == 'SENS'):
  df = pd.read_csv('/content/Final_SENS.csv')
if(w.value == 'TSLA'):
  df = pd.read_csv('/content/Final_TSLA.csv')
if(w.value == 'TWTR'):
  df = pd.read_csv('/content/Final_TWTR.csv')
if(w.value == 'UUUU'):
  df = pd.read_csv('/content/Final_UUUU.csv')
In [1701]:
pd.set_option('display.max_colwidth', None)
In [1702]:
df['Date'] = df['Date'].astype("datetime64[ns]")
In [1703]:
del df['Unnamed: 0']
In [1704]:
df.head(5)
Out[1704]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-08-27 36.015179 36.053131 35.749527 35.920303 34.215881 22156556 -0.499348 0.913041 0.122587 0.564586 37.162704 35.762208 36.462456 NaN 1.571774 0.351044 20.820624 NaN NaN NaN -0.294117 NaN -0.008122 48.455694 NaN NaN 28.318531 49.190310 1.353090e+08 2.194523e+07 -1771352.0 0.0 1.231714e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.231714e+06 0.0 0.0 1.231714e+06 3 230 0 233 0 0 233 233
1 2020-08-28 35.929790 36.110058 35.483871 35.967743 34.261074 32830519 0.132069 0.899825 0.151520 0.573386 37.193530 35.636506 36.415018 NaN 1.594167 0.626186 36.071429 NaN NaN NaN -0.142315 NaN -0.003941 49.151619 NaN NaN 24.617159 34.170059 1.532166e+08 2.545080e+07 31059167.0 0.0 1.493263e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.493263e+06 0.0 0.0 1.493263e+06 2 200 0 202 0 0 202 202
2 2020-08-31 35.901329 36.072105 35.578747 35.853889 34.152615 30032465 -0.316543 1.032070 0.165871 0.561953 37.103513 35.474421 36.288967 -0.056865 1.567343 0.493359 36.071429 NaN NaN NaN -0.531311 -0.156283 -0.014602 47.494273 NaN NaN 26.403700 26.446463 1.566820e+08 2.567380e+07 1026702.0 0.0 2.188663e+06 0.0 0.0 0.0 0.0 0.0 0.0 2.188663e+06 0.0 0.0 2.188663e+06 3 253 0 256 0 0 256 256
3 2020-09-01 35.853889 35.882355 34.639469 34.990513 33.330208 36145560 -2.408042 0.819452 0.281973 0.659229 37.079914 34.955868 36.017891 -0.126076 1.884024 1.242886 66.823707 NaN NaN NaN -1.404175 -0.346727 -0.038582 37.239646 NaN NaN 28.580330 26.533730 1.409545e+08 1.842684e+07 -35118858.0 0.0 1.557189e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.557189e+06 0.0 0.0 1.557189e+06 2 218 0 220 0 0 220 220
4 2020-09-02 34.886147 35.388992 34.535103 35.294117 33.619408 29068688 0.867676 0.223173 0.208462 0.687038 36.708761 34.882456 35.795609 -0.150586 1.946608 0.853889 68.771094 NaN NaN NaN -1.005692 -0.414890 -0.027705 41.983408 NaN NaN 34.278714 29.754248 1.635635e+08 2.098080e+07 -6050170.0 0.0 1.908042e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.908042e+06 0.0 0.0 1.908042e+06 1 202 0 203 0 0 203 203
In [1705]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       332 non-null    datetime64[ns]
 1   Open                       332 non-null    float64       
 2   High                       332 non-null    float64       
 3   Low                        332 non-null    float64       
 4   Close                      332 non-null    float64       
 5   Adj Close                  332 non-null    float64       
 6   Volume                     332 non-null    int64         
 7   Return                     332 non-null    float64       
 8   Beta                       332 non-null    float64       
 9   Variance                   332 non-null    float64       
 10  AvgTrueRange               332 non-null    float64       
 11  Upperband                  332 non-null    float64       
 12  Lowerband                  332 non-null    float64       
 13  Middleband                 332 non-null    float64       
 14  APO                        330 non-null    float64       
 15  NATR                       332 non-null    float64       
 16  TRANGE                     332 non-null    float64       
 17  DMI                        332 non-null    float64       
 18  MACD                       322 non-null    float64       
 19  MACDSIGNAL                 322 non-null    float64       
 20  MACDHIST                   322 non-null    float64       
 21  MOM                        332 non-null    float64       
 22  PPO                        330 non-null    float64       
 23  ROCP                       332 non-null    float64       
 24  RSI                        332 non-null    float64       
 25  TRIX                       267 non-null    float64       
 26  ULTOSC                     327 non-null    float64       
 27  SLOWK                      332 non-null    float64       
 28  SLOWD                      332 non-null    float64       
 29  AD                         332 non-null    float64       
 30  ADOSC                      332 non-null    float64       
 31  OBV                        332 non-null    float64       
 32  Upward_momentum_created    332 non-null    float64       
 33  Downward_momentum_created  332 non-null    float64       
 34  B5_O_Um                    332 non-null    float64       
 35  B5_C_Um                    332 non-null    float64       
 36  B5_E_Um                    332 non-null    float64       
 37  B5_A_Um                    332 non-null    float64       
 38  B5_N_Um                    332 non-null    float64       
 39  B5_O_Dm                    332 non-null    float64       
 40  B5_C_Dm                    332 non-null    float64       
 41  B5_E_Dm                    332 non-null    float64       
 42  B5_A_Dm                    332 non-null    float64       
 43  B5_N_Dm                    332 non-null    float64       
 44  Verified_status_True       332 non-null    int64         
 45  Verified_status_False      332 non-null    int64         
 46  O                          332 non-null    int64         
 47  C                          332 non-null    int64         
 48  E                          332 non-null    int64         
 49  A                          332 non-null    int64         
 50  N                          332 non-null    int64         
 51  Real_or_Fake_tweet         332 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 135.0 KB
In [1706]:
df.shape
Out[1706]:
(332, 52)
In [1707]:
sns.set(font_scale=0.8)
In [1708]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# PLOT OUT BTC-USE'S CLOSING PRICES SINCE 2014
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')    
In [1709]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
df['returns'] = 100 * df.Close.pct_change().dropna()
In [1710]:
# CALCULATE LOG RETURNS BASED ON ABOVE FORMULA
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [1711]:
df.head()
Out[1711]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-08-27 36.015179 36.053131 35.749527 35.920303 34.215881 22156556 -0.499348 0.913041 0.122587 0.564586 37.162704 35.762208 36.462456 NaN 1.571774 0.351044 20.820624 NaN NaN NaN -0.294117 NaN -0.008122 48.455694 NaN NaN 28.318531 49.190310 1.353090e+08 2.194523e+07 -1771352.0 0.0 1.231714e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.231714e+06 0.0 0.0 1.231714e+06 3 230 0 233 0 0 233 233 NaN NaN
1 2020-08-28 35.929790 36.110058 35.483871 35.967743 34.261074 32830519 0.132069 0.899825 0.151520 0.573386 37.193530 35.636506 36.415018 NaN 1.594167 0.626186 36.071429 NaN NaN NaN -0.142315 NaN -0.003941 49.151619 NaN NaN 24.617159 34.170059 1.532166e+08 2.545080e+07 31059167.0 0.0 1.493263e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.493263e+06 0.0 0.0 1.493263e+06 2 200 0 202 0 0 202 202 0.132069 0.001320
2 2020-08-31 35.901329 36.072105 35.578747 35.853889 34.152615 30032465 -0.316543 1.032070 0.165871 0.561953 37.103513 35.474421 36.288967 -0.056865 1.567343 0.493359 36.071429 NaN NaN NaN -0.531311 -0.156283 -0.014602 47.494273 NaN NaN 26.403700 26.446463 1.566820e+08 2.567380e+07 1026702.0 0.0 2.188663e+06 0.0 0.0 0.0 0.0 0.0 0.0 2.188663e+06 0.0 0.0 2.188663e+06 3 253 0 256 0 0 256 256 -0.316543 -0.003170
3 2020-09-01 35.853889 35.882355 34.639469 34.990513 33.330208 36145560 -2.408042 0.819452 0.281973 0.659229 37.079914 34.955868 36.017891 -0.126076 1.884024 1.242886 66.823707 NaN NaN NaN -1.404175 -0.346727 -0.038582 37.239646 NaN NaN 28.580330 26.533730 1.409545e+08 1.842684e+07 -35118858.0 0.0 1.557189e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.557189e+06 0.0 0.0 1.557189e+06 2 218 0 220 0 0 220 220 -2.408042 -0.024375
4 2020-09-02 34.886147 35.388992 34.535103 35.294117 33.619408 29068688 0.867676 0.223173 0.208462 0.687038 36.708761 34.882456 35.795609 -0.150586 1.946608 0.853889 68.771094 NaN NaN NaN -1.005692 -0.414890 -0.027705 41.983408 NaN NaN 34.278714 29.754248 1.635635e+08 2.098080e+07 -6050170.0 0.0 1.908042e+06 0.0 0.0 0.0 0.0 0.0 0.0 1.908042e+06 0.0 0.0 1.908042e+06 1 202 0 203 0 0 203 203 0.867676 0.008639
In [1712]:
# DROPPING THE 1ST ROW OF DATA 
# BECAUSE I SHIFTED IT FORWARD TO CALCULATE RETURNS/LOG RETURNS
df.dropna(inplace=True)
In [1713]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')

    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')

    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')

    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
In [1714]:
# CREATE A FUNCTION THAT CALCULATE REALIZED VOLATILITY
# FROM SAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Get the daily realized volatility which is calculated as the square root
    of sum of squares of log returns within a specific window interval 
    """
    n = len(series_log_return)
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [1715]:
intervals = [7, 30, 60, 180, 365]
vols_df = {}

# ITERATE OVER intervals LIST
for i in intervals:
    # GET DAILY LOG RETURNS USING THAT INTERVAL
    vols = df.log_returns.rolling(window=i)\
                         .apply(realized_volatility_daily).values

    vols_df[i] = vols

# CONVERT vols_df FROM DICTIONARY TO PANDAS DATAFRAME
vols_df = pd.DataFrame(vols_df, columns=intervals, index=df.index)
In [1716]:
# CHANGING MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

for i in intervals:
    if i == 7:
        alpha = 0.5
        lw = 1
    else:
        alpha = 1.0
        lw = 2
    ax.plot(vols_df[i], label=f'{i}-Day Interval Realized Volatility', 
            alpha=alpha, lw=lw)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [1717]:
INTERVAL_WINDOW = 30
n_future = 7

# GET BACKWARD LOOKING REALIZED VOLATILITY
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY 
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [1718]:
df.describe()
Out[1718]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 267.000000 267.000000 267.000000 267.000000 267.000000 2.670000e+02 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 2.670000e+02 2.670000e+02 2.670000e+02 267.0 2.670000e+02 267.0 267.0 267.0 267.0 267.0 267.0 2.670000e+02 267.0 267.0 2.670000e+02 267.000000 267.000000 267.0 267.000000 267.0 267.0 267.000000 267.000000 267.000000 267.000000 238.000000 231.000000
mean 41.341536 41.807341 40.922959 41.377079 40.660002 3.456984e+07 0.199170 0.596934 0.656903 0.917862 42.376863 39.864057 41.120460 0.457039 2.142366 0.949925 42.490515 0.415656 0.387353 0.028303 0.730641 1.045262 0.017674 54.849718 0.103090 51.172985 52.658415 52.538121 -3.251774e+08 8.146499e+05 -5.475891e+07 0.0 6.381022e+06 0.0 0.0 0.0 0.0 0.0 0.0 6.381022e+06 0.0 0.0 6.381022e+06 12.453184 430.670412 0.0 443.123596 0.0 0.0 443.123596 443.123596 0.199170 0.001851 0.014150 0.014081
std 5.531748 5.801935 5.363987 5.619693 5.962706 1.908874e+07 1.686194 0.300532 1.327252 0.457317 5.938692 4.634444 5.226860 1.275853 0.768976 0.718035 24.743083 0.884761 0.798631 0.272317 2.428169 2.954448 0.056877 14.231439 0.132967 11.240803 25.684098 24.035088 1.130385e+08 2.759065e+07 2.173441e+08 0.0 7.997408e+06 0.0 0.0 0.0 0.0 0.0 0.0 7.997408e+06 0.0 0.0 7.997408e+06 15.186332 350.930948 0.0 363.537625 0.0 0.0 363.537625 363.537625 1.686194 0.016664 0.006210 0.006287
min 33.509998 33.889999 33.360001 33.490002 32.579582 1.198420e+07 -5.140964 -0.511293 0.006669 0.451289 34.262552 33.195904 33.775714 -1.631153 1.151746 0.250000 0.014580 -0.925396 -0.872224 -0.569351 -5.820000 -4.220739 -0.136748 25.891772 -0.145526 23.576870 4.130361 5.315026 -5.529623e+08 -6.313508e+07 -4.950888e+08 0.0 9.097497e+05 0.0 0.0 0.0 0.0 0.0 0.0 9.097497e+05 0.0 0.0 9.097497e+05 0.000000 115.000000 0.0 119.000000 0.0 0.0 119.000000 119.000000 -5.140964 -0.052778 0.008340 0.008340
25% 37.035000 37.375000 36.735001 37.095001 35.755972 2.237085e+07 -0.670524 0.402016 0.079933 0.583751 37.716244 36.232522 37.004286 -0.496731 1.578542 0.519999 21.828898 -0.289985 -0.251591 -0.118300 -1.005001 -1.356612 -0.026308 44.002747 0.000499 42.289102 29.824059 31.561634 -4.055696e+08 -1.899862e+07 -1.759750e+08 0.0 2.496112e+06 0.0 0.0 0.0 0.0 0.0 0.0 2.496112e+06 0.0 0.0 2.496112e+06 4.000000 228.000000 0.0 233.000000 0.0 0.0 233.000000 233.000000 -0.670524 -0.006728 0.010020 0.009989
50% 39.770000 40.139999 39.430000 39.810001 38.952194 2.902930e+07 0.046741 0.597889 0.200939 0.748204 40.484961 38.888702 39.662858 0.324102 1.848347 0.689999 43.097535 0.221667 0.244118 0.001579 0.500000 0.821568 0.012294 54.056992 0.106476 51.423592 55.462918 53.903710 -3.402830e+08 7.954396e+05 -3.876457e+07 0.0 4.043242e+06 0.0 0.0 0.0 0.0 0.0 0.0 4.043242e+06 0.0 0.0 4.043242e+06 8.000000 298.000000 0.0 308.000000 0.0 0.0 308.000000 308.000000 0.046741 0.000467 0.011079 0.010819
75% 44.209999 44.685001 43.820002 44.074999 43.810438 3.993125e+07 0.853166 0.759535 0.630674 1.118364 45.394228 42.705025 44.015714 1.250288 2.571077 1.094999 60.336901 0.888221 0.821314 0.226816 1.945000 3.235375 0.046725 67.375046 0.188753 60.064678 75.695495 74.398526 -2.485247e+08 1.822913e+07 7.688248e+07 0.0 6.993426e+06 0.0 0.0 0.0 0.0 0.0 0.0 6.993426e+06 0.0 0.0 6.993426e+06 14.000000 511.000000 0.0 540.500000 0.0 0.0 540.500000 540.500000 0.853166 0.008495 0.017888 0.018180
max 60.599998 61.709999 59.830002 61.250000 61.250000 1.737533e+08 10.855193 1.612741 11.378360 2.487073 63.730053 51.718518 57.724286 3.444615 4.349617 4.960003 96.758574 2.880638 2.248791 0.728448 9.540001 7.442927 0.185315 84.689391 0.376382 77.032434 97.931938 94.760281 1.029095e+07 7.363839e+07 4.712694e+08 0.0 6.752093e+07 0.0 0.0 0.0 0.0 0.0 0.0 6.752093e+07 0.0 0.0 6.752093e+07 117.000000 2651.000000 0.0 2745.000000 0.0 0.0 2745.000000 2745.000000 10.855193 0.103055 0.033132 0.033132
In [1719]:
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [1720]:
df = df.fillna(df.median())
In [1721]:
df.isna().sum()
Out[1721]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1722]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 267 entries, 65 to 331
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       267 non-null    datetime64[ns]
 1   Open                       267 non-null    float64       
 2   High                       267 non-null    float64       
 3   Low                        267 non-null    float64       
 4   Close                      267 non-null    float64       
 5   Adj Close                  267 non-null    float64       
 6   Volume                     267 non-null    int64         
 7   Return                     267 non-null    float64       
 8   Beta                       267 non-null    float64       
 9   Variance                   267 non-null    float64       
 10  AvgTrueRange               267 non-null    float64       
 11  Upperband                  267 non-null    float64       
 12  Lowerband                  267 non-null    float64       
 13  Middleband                 267 non-null    float64       
 14  APO                        267 non-null    float64       
 15  NATR                       267 non-null    float64       
 16  TRANGE                     267 non-null    float64       
 17  DMI                        267 non-null    float64       
 18  MACD                       267 non-null    float64       
 19  MACDSIGNAL                 267 non-null    float64       
 20  MACDHIST                   267 non-null    float64       
 21  MOM                        267 non-null    float64       
 22  PPO                        267 non-null    float64       
 23  ROCP                       267 non-null    float64       
 24  RSI                        267 non-null    float64       
 25  TRIX                       267 non-null    float64       
 26  ULTOSC                     267 non-null    float64       
 27  SLOWK                      267 non-null    float64       
 28  SLOWD                      267 non-null    float64       
 29  AD                         267 non-null    float64       
 30  ADOSC                      267 non-null    float64       
 31  OBV                        267 non-null    float64       
 32  Upward_momentum_created    267 non-null    float64       
 33  Downward_momentum_created  267 non-null    float64       
 34  B5_O_Um                    267 non-null    float64       
 35  B5_C_Um                    267 non-null    float64       
 36  B5_E_Um                    267 non-null    float64       
 37  B5_A_Um                    267 non-null    float64       
 38  B5_N_Um                    267 non-null    float64       
 39  B5_O_Dm                    267 non-null    float64       
 40  B5_C_Dm                    267 non-null    float64       
 41  B5_E_Dm                    267 non-null    float64       
 42  B5_A_Dm                    267 non-null    float64       
 43  B5_N_Dm                    267 non-null    float64       
 44  Verified_status_True       267 non-null    int64         
 45  Verified_status_False      267 non-null    int64         
 46  O                          267 non-null    int64         
 47  C                          267 non-null    int64         
 48  E                          267 non-null    int64         
 49  A                          267 non-null    int64         
 50  N                          267 non-null    int64         
 51  Fake_news                  267 non-null    int64         
 52  returns                    267 non-null    float64       
 53  log_returns                267 non-null    float64       
 54  vol_current                267 non-null    float64       
 55  vol_future                 267 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 118.9 KB
In [1723]:
df.shape
Out[1723]:
(267, 56)
In [1724]:
df=df.dropna()
In [1725]:
df.dtypes
Out[1725]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1726]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
Out[1726]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077b049450>
In [1727]:
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
In [1728]:
df_corr = df.corr()['AvgTrueRange'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 26 strongly correlated values with AvgTrueRange:
AvgTrueRange             1.000000
NATR                     0.968401
Upperband                0.895775
vol_current              0.874643
High                     0.874121
Open                     0.866028
Close                    0.861778
Middleband               0.858908
Low                      0.853943
Adj Close                0.849940
Lowerband                0.789535
TRANGE                   0.772593
MACDSIGNAL               0.751001
MACD                     0.741723
AD                       0.722621
Variance                 0.700021
Verified_status_False    0.680777
Fake_news                0.675485
C                        0.675485
N                        0.675485
vol_future               0.666066
APO                      0.651308
PPO                      0.608567
OBV                      0.608035
Volume                   0.551998
TRIX                     0.530199
Name: AvgTrueRange, dtype: float64
In [1729]:
df_corr = df.corr()['NATR'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 25 strongly correlated values with NATR :
NATR                     1.000000
AvgTrueRange             0.968401
Upperband                0.785365
vol_current              0.780166
High                     0.747833
Open                     0.741206
Middleband               0.739468
TRANGE                   0.734947
Close                    0.729652
Low                      0.722409
Adj Close                0.713284
MACDSIGNAL               0.710371
Verified_status_False    0.689165
Fake_news                0.685435
C                        0.685435
N                        0.685435
AD                       0.684641
MACD                     0.684506
Lowerband                0.661599
Variance                 0.636008
APO                      0.622256
vol_future               0.604404
Volume                   0.599570
PPO                      0.591463
OBV                      0.539363
Name: NATR, dtype: float64
In [1730]:
df_corr = df.corr()['TRANGE'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 27 strongly correlated values with TRANGE:
TRANGE                       1.000000
Verified_status_False        0.814823
Fake_news                    0.809836
N                            0.809836
C                            0.809836
Volume                       0.794626
AvgTrueRange                 0.772593
NATR                         0.734947
High                         0.705183
Close                        0.684563
Open                         0.680834
Adj Close                    0.675220
Upperband                    0.672636
Low                          0.654646
vol_current                  0.645700
Middleband                   0.635225
MACD                         0.593326
AD                           0.574406
Lowerband                    0.570918
Variance                     0.562711
Verified_status_True         0.557022
MACDSIGNAL                   0.541237
vol_future                   0.533776
APO                          0.530045
Downward_momentum_created    0.516901
B5_C_Dm                      0.516901
B5_N_Dm                      0.516901
Name: TRANGE, dtype: float64
In [1731]:
df_corr = df.corr()['O'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Openness:
Series([], Name: O, dtype: float64)
In [1732]:
df_corr = df.corr()['C'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 26 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
Verified_status_False        0.999719
Volume                       0.847432
Verified_status_True         0.836626
TRANGE                       0.809836
B5_N_Dm                      0.754931
B5_C_Dm                      0.754931
Downward_momentum_created    0.754931
NATR                         0.685435
AvgTrueRange                 0.675485
AD                           0.610988
MACD                         0.585018
High                         0.558222
Close                        0.543664
Upperband                    0.541440
Open                         0.540505
APO                          0.537068
PPO                          0.528365
MACDSIGNAL                   0.528357
Adj Close                    0.523958
Low                          0.520149
OBV                          0.509716
Variance                     0.506832
MOM                          0.503217
Name: C, dtype: float64
In [1733]:
df_corr = df.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [1734]:
df_corr = df.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1735]:
df_corr = df.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 26 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
Verified_status_False        0.999719
Volume                       0.847432
Verified_status_True         0.836626
TRANGE                       0.809836
B5_N_Dm                      0.754931
B5_C_Dm                      0.754931
Downward_momentum_created    0.754931
NATR                         0.685435
AvgTrueRange                 0.675485
AD                           0.610988
MACD                         0.585018
High                         0.558222
Close                        0.543664
Upperband                    0.541440
Open                         0.540505
APO                          0.537068
PPO                          0.528365
MACDSIGNAL                   0.528357
Adj Close                    0.523958
Low                          0.520149
OBV                          0.509716
Variance                     0.506832
MOM                          0.503217
Name: N, dtype: float64
In [1736]:
df.columns
Out[1736]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1737]:
df_corr = df.corr()['B5_O_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1738]:
df_corr = df.corr()['B5_C_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1739]:
df_corr = df.corr()['B5_E_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1740]:
df_corr = df.corr()['B5_A_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1741]:
df_corr = df.corr()['B5_N_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1742]:
df_corr = df.corr()['B5_O_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Dm:
Series([], Name: B5_O_Dm, dtype: float64)
In [1743]:
df_corr = df.corr()['B5_C_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.773798
Fake_news                    0.754931
N                            0.754931
C                            0.754931
Verified_status_False        0.748566
Volume                       0.622318
TRANGE                       0.516901
Name: B5_C_Dm, dtype: float64
In [1744]:
df_corr = df.corr()['B5_E_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1745]:
df_corr = df.corr()['B5_A_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1746]:
df_corr = df.corr()['B5_N_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.773798
Fake_news                    0.754931
N                            0.754931
C                            0.754931
Verified_status_False        0.748566
Volume                       0.622318
TRANGE                       0.516901
Name: B5_N_Dm, dtype: float64
In [1747]:
df_corr = df.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 26 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
C                            1.000000
Verified_status_False        0.999719
Volume                       0.847432
Verified_status_True         0.836626
TRANGE                       0.809836
B5_N_Dm                      0.754931
B5_C_Dm                      0.754931
Downward_momentum_created    0.754931
NATR                         0.685435
AvgTrueRange                 0.675485
AD                           0.610988
MACD                         0.585018
High                         0.558222
Close                        0.543664
Upperband                    0.541440
Open                         0.540505
APO                          0.537068
PPO                          0.528365
MACDSIGNAL                   0.528357
Adj Close                    0.523958
Low                          0.520149
OBV                          0.509716
Variance                     0.506832
MOM                          0.503217
Name: Fake_news, dtype: float64
In [1748]:
df_corr = df.corr()['Downward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.773798
Fake_news                    0.754931
N                            0.754931
C                            0.754931
Verified_status_False        0.748566
Volume                       0.622318
TRANGE                       0.516901
Name: Downward_momentum_created, dtype: float64
In [1749]:
df_corr = df.corr()['Upward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1750]:
df_corr = df.corr()['Verified_status_True'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 10 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.836626
N                            0.836626
C                            0.836626
Verified_status_False        0.823407
B5_N_Dm                      0.773798
B5_C_Dm                      0.773798
Downward_momentum_created    0.773798
Volume                       0.690076
TRANGE                       0.557022
Name: Verified_status_True, dtype: float64
In [1751]:
df_corr = df.corr()['Verified_status_False'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 26 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999719
N                            0.999719
C                            0.999719
Volume                       0.848012
Verified_status_True         0.823407
TRANGE                       0.814823
B5_N_Dm                      0.748566
B5_C_Dm                      0.748566
Downward_momentum_created    0.748566
NATR                         0.689165
AvgTrueRange                 0.680777
AD                           0.617635
MACD                         0.591458
High                         0.564752
Close                        0.550282
Upperband                    0.547783
Open                         0.546703
APO                          0.542979
MACDSIGNAL                   0.534580
PPO                          0.533477
Adj Close                    0.530614
Low                          0.526538
OBV                          0.513892
Variance                     0.512482
MOM                          0.507472
Name: Verified_status_False, dtype: float64
In [1752]:
sns.set(font_scale=0.8)
In [1753]:
for i in range(0, len(df.columns), 5):
    sns.pairplot(data=df,
                x_vars=df.columns[i:i+5],
                y_vars=['NATR'])
In [1754]:
df.dtypes
Out[1754]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1755]:
df.isnull().sum()
Out[1755]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1756]:
df.fillna(0, inplace = True)
In [1757]:
df.dropna(inplace=True)
In [1758]:
sns.set(font_scale=0.8)
In [1759]:
corr = df.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [1760]:
df.describe()
Out[1760]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 267.000000 267.000000 267.000000 267.000000 267.000000 2.670000e+02 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000 2.670000e+02 2.670000e+02 2.670000e+02 267.0 2.670000e+02 267.0 267.0 267.0 267.0 267.0 267.0 2.670000e+02 267.0 267.0 2.670000e+02 267.000000 267.000000 267.0 267.000000 267.0 267.0 267.000000 267.000000 267.000000 267.000000 267.000000 267.000000
mean 41.341536 41.807341 40.922959 41.377079 40.660002 3.456984e+07 0.199170 0.596934 0.656903 0.917862 42.376863 39.864057 41.120460 0.457039 2.142366 0.949925 42.490515 0.415656 0.387353 0.028303 0.730641 1.045262 0.017674 54.849718 0.103090 51.172985 52.658415 52.538121 -3.251774e+08 8.146499e+05 -5.475891e+07 0.0 6.381022e+06 0.0 0.0 0.0 0.0 0.0 0.0 6.381022e+06 0.0 0.0 6.381022e+06 12.453184 430.670412 0.0 443.123596 0.0 0.0 443.123596 443.123596 0.199170 0.001851 0.013816 0.013641
std 5.531748 5.801935 5.363987 5.619693 5.962706 1.908874e+07 1.686194 0.300532 1.327252 0.457317 5.938692 4.634444 5.226860 1.275853 0.768976 0.718035 24.743083 0.884761 0.798631 0.272317 2.428169 2.954448 0.056877 14.231439 0.132967 11.240803 25.684098 24.035088 1.130385e+08 2.759065e+07 2.173441e+08 0.0 7.997408e+06 0.0 0.0 0.0 0.0 0.0 0.0 7.997408e+06 0.0 0.0 7.997408e+06 15.186332 350.930948 0.0 363.537625 0.0 0.0 363.537625 363.537625 1.686194 0.016664 0.005939 0.005952
min 33.509998 33.889999 33.360001 33.490002 32.579582 1.198420e+07 -5.140964 -0.511293 0.006669 0.451289 34.262552 33.195904 33.775714 -1.631153 1.151746 0.250000 0.014580 -0.925396 -0.872224 -0.569351 -5.820000 -4.220739 -0.136748 25.891772 -0.145526 23.576870 4.130361 5.315026 -5.529623e+08 -6.313508e+07 -4.950888e+08 0.0 9.097497e+05 0.0 0.0 0.0 0.0 0.0 0.0 9.097497e+05 0.0 0.0 9.097497e+05 0.000000 115.000000 0.0 119.000000 0.0 0.0 119.000000 119.000000 -5.140964 -0.052778 0.008340 0.008340
25% 37.035000 37.375000 36.735001 37.095001 35.755972 2.237085e+07 -0.670524 0.402016 0.079933 0.583751 37.716244 36.232522 37.004286 -0.496731 1.578542 0.519999 21.828898 -0.289985 -0.251591 -0.118300 -1.005001 -1.356612 -0.026308 44.002747 0.000499 42.289102 29.824059 31.561634 -4.055696e+08 -1.899862e+07 -1.759750e+08 0.0 2.496112e+06 0.0 0.0 0.0 0.0 0.0 0.0 2.496112e+06 0.0 0.0 2.496112e+06 4.000000 228.000000 0.0 233.000000 0.0 0.0 233.000000 233.000000 -0.670524 -0.006728 0.010085 0.010085
50% 39.770000 40.139999 39.430000 39.810001 38.952194 2.902930e+07 0.046741 0.597889 0.200939 0.748204 40.484961 38.888702 39.662858 0.324102 1.848347 0.689999 43.097535 0.221667 0.244118 0.001579 0.500000 0.821568 0.012294 54.056992 0.106476 51.423592 55.462918 53.903710 -3.402830e+08 7.954396e+05 -3.876457e+07 0.0 4.043242e+06 0.0 0.0 0.0 0.0 0.0 0.0 4.043242e+06 0.0 0.0 4.043242e+06 8.000000 298.000000 0.0 308.000000 0.0 0.0 308.000000 308.000000 0.046741 0.000467 0.011079 0.010819
75% 44.209999 44.685001 43.820002 44.074999 43.810438 3.993125e+07 0.853166 0.759535 0.630674 1.118364 45.394228 42.705025 44.015714 1.250288 2.571077 1.094999 60.336901 0.888221 0.821314 0.226816 1.945000 3.235375 0.046725 67.375046 0.188753 60.064678 75.695495 74.398526 -2.485247e+08 1.822913e+07 7.688248e+07 0.0 6.993426e+06 0.0 0.0 0.0 0.0 0.0 0.0 6.993426e+06 0.0 0.0 6.993426e+06 14.000000 511.000000 0.0 540.500000 0.0 0.0 540.500000 540.500000 0.853166 0.008495 0.015439 0.013829
max 60.599998 61.709999 59.830002 61.250000 61.250000 1.737533e+08 10.855193 1.612741 11.378360 2.487073 63.730053 51.718518 57.724286 3.444615 4.349617 4.960003 96.758574 2.880638 2.248791 0.728448 9.540001 7.442927 0.185315 84.689391 0.376382 77.032434 97.931938 94.760281 1.029095e+07 7.363839e+07 4.712694e+08 0.0 6.752093e+07 0.0 0.0 0.0 0.0 0.0 0.0 6.752093e+07 0.0 0.0 6.752093e+07 117.000000 2651.000000 0.0 2745.000000 0.0 0.0 2745.000000 2745.000000 10.855193 0.103055 0.033132 0.033132
In [1761]:
# DROPPING ALL NaN VALUES
df.dropna(inplace=True)
In [1762]:
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [1763]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')
    
    plt.show();

Experiment 2: weekly granularity

In [1767]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected PFE
In [1768]:
if(w.value == 'AAPL'):
  df = pd.read_csv('/content/Final_AAPL.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'ABUS'):
  df = pd.read_csv('/content/Final_ABUS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'ARDS'):
  df = pd.read_csv('/content/Final_ARDS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'BABA'):
  df = pd.read_csv('/content/Final_BABA.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'BFRI'):
  df = pd.read_csv('/content/Final_BFRI.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'FB'):
  df = pd.read_csv('/content/Final_FB.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'GME'):
  df = pd.read_csv('/content/Final_GME.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'MCD'):
  df = pd.read_csv('/content/Final_MCD.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'PFE'):
  df = pd.read_csv('/content/Final_PFE.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'PLUG'):
  df = pd.read_csv('/content/Final_PLUG.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'QCOM'):
  df = pd.read_csv('/content/Final_QCOM.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'SENS'):
  df = pd.read_csv('/content/Final_SENS.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'TSLA'):
  df = pd.read_csv('/content/Final_TSLA.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'TWTR'):
  df = pd.read_csv('/content/Final_TWTR.csv', parse_dates=['Date'], index_col=['Date'])
if(w.value == 'UUUU'):
  df = pd.read_csv('/content/Final_UUUU.csv', parse_dates=['Date'], index_col=['Date'])
In [1769]:
df.columns
Out[1769]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1770]:
df.shape
Out[1770]:
(332, 52)
In [1771]:
df.isnull().sum()
Out[1771]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           2
NATR                          0
TRANGE                        0
DMI                           0
MACD                         10
MACDSIGNAL                   10
MACDHIST                     10
MOM                           0
PPO                           2
ROCP                          0
RSI                           0
TRIX                         65
ULTOSC                        5
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1772]:
df = df.fillna(df.median())
del df['Unnamed: 0']
df.rename(columns = {'Real_or_Fake_tweet': 'Fake_news'}, inplace = True)
In [1773]:
df_weekly = df.resample('W').agg('mean')
In [1774]:
df_weekly.shape
Out[1774]:
(70, 51)
In [1775]:
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[1775]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077312da10>
In [1776]:
sns.set(font_scale=0.8)
In [1777]:
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1778]:
df_corr = df_weekly.corr()['AvgTrueRange'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with AvgTrueRange:
AvgTrueRange                 1.000000
NATR                         0.937059
TRANGE                       0.894891
Upperband                    0.832537
High                         0.816945
Verified_status_False        0.814836
N                            0.810274
Fake_news                    0.810274
C                            0.810274
Open                         0.809953
Close                        0.801792
Low                          0.793010
Middleband                   0.788825
Adj Close                    0.784859
Variance                     0.757436
MACD                         0.752191
MACDSIGNAL                   0.747842
Lowerband                    0.716858
Volume                       0.676010
APO                          0.635122
OBV                          0.609844
B5_N_Dm                      0.596301
B5_C_Dm                      0.596301
Downward_momentum_created    0.596301
Verified_status_True         0.589283
PPO                          0.588844
TRIX                         0.541811
MOM                          0.530812
Name: AvgTrueRange, dtype: float64
In [1779]:
df_corr = df_weekly.corr()['NATR'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(golden_features_list), golden_features_list))
There are 24 strongly correlated values with NATR :
NATR                         1.000000
AvgTrueRange                 0.937059
TRANGE                       0.834418
Verified_status_False        0.820175
N                            0.819497
Fake_news                    0.819497
C                            0.819497
Volume                       0.749785
B5_C_Dm                      0.709473
Downward_momentum_created    0.709473
B5_N_Dm                      0.709473
Verified_status_True         0.695119
MACDSIGNAL                   0.622500
MACD                         0.616221
Variance                     0.613847
Upperband                    0.610423
High                         0.585829
Open                         0.578349
Close                        0.563765
Middleband                   0.555865
Low                          0.553680
Adj Close                    0.541932
APO                          0.541390
PPO                          0.511729
Name: NATR, dtype: float64
In [1780]:
df_corr = df_weekly.corr()['TRANGE'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(golden_features_list), golden_features_list))
There are 28 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.894891
Verified_status_False        0.873534
N                            0.870380
Fake_news                    0.870380
C                            0.870380
NATR                         0.834418
Volume                       0.801932
High                         0.750798
Upperband                    0.744386
Open                         0.737392
Close                        0.733447
Adj Close                    0.719011
Low                          0.715585
MACD                         0.699619
Middleband                   0.699005
Verified_status_True         0.676983
Variance                     0.671003
MACDSIGNAL                   0.648188
Lowerband                    0.626711
APO                          0.612152
B5_C_Dm                      0.610869
Downward_momentum_created    0.610869
B5_N_Dm                      0.610869
MOM                          0.588076
PPO                          0.571412
OBV                          0.562844
ROCP                         0.553559
Name: TRANGE, dtype: float64
In [1781]:
df_corr = df_weekly.corr()['O'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Openness:
Series([], Name: O, dtype: float64)
In [1782]:
df_corr = df_weekly.corr()['C'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 27 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
Verified_status_False        0.999813
Volume                       0.915956
Verified_status_True         0.873760
TRANGE                       0.870380
B5_N_Dm                      0.832005
B5_C_Dm                      0.832005
Downward_momentum_created    0.832005
NATR                         0.819497
AvgTrueRange                 0.810274
MACD                         0.659968
Variance                     0.619638
MACDSIGNAL                   0.603745
Upperband                    0.601182
High                         0.600643
MOM                          0.592654
Open                         0.590841
ROCP                         0.585983
Close                        0.582813
APO                          0.579894
Low                          0.568294
Adj Close                    0.563709
PPO                          0.563217
OBV                          0.552334
Middleband                   0.545463
Name: C, dtype: float64
In [1783]:
df_corr = df_weekly.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [1784]:
df_corr = df_weekly.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1785]:
df_corr = df_weekly.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(golden_features_list), golden_features_list))
There are 27 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
C                            1.000000
Verified_status_False        0.999813
Volume                       0.915956
Verified_status_True         0.873760
TRANGE                       0.870380
B5_N_Dm                      0.832005
B5_C_Dm                      0.832005
Downward_momentum_created    0.832005
NATR                         0.819497
AvgTrueRange                 0.810274
MACD                         0.659968
Variance                     0.619638
MACDSIGNAL                   0.603745
Upperband                    0.601182
High                         0.600643
MOM                          0.592654
Open                         0.590841
ROCP                         0.585983
Close                        0.582813
APO                          0.579894
Low                          0.568294
Adj Close                    0.563709
PPO                          0.563217
OBV                          0.552334
Middleband                   0.545463
Name: N, dtype: float64
In [1786]:
df_corr = df_weekly.corr()['B5_O_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [1787]:
df_corr = df_weekly.corr()['B5_C_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [1788]:
df_corr = df_weekly.corr()['B5_E_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [1789]:
df_corr = df_weekly.corr()['B5_A_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1790]:
df_corr = df_weekly.corr()['B5_N_Um'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [1791]:
df_corr = df_weekly.corr()['B5_O_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_O_Dm:
Series([], Name: B5_O_Dm, dtype: float64)
In [1792]:
df_corr = df_weekly.corr()['B5_C_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.906063
Fake_news                    0.832005
N                            0.832005
C                            0.832005
Verified_status_False        0.824738
Volume                       0.794032
NATR                         0.709473
TRANGE                       0.610869
AvgTrueRange                 0.596301
Name: B5_C_Dm, dtype: float64
In [1793]:
df_corr = df_weekly.corr()['B5_E_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [1794]:
df_corr = df_weekly.corr()['B5_A_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1795]:
df_corr = df_weekly.corr()['B5_N_Dm'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.906063
Fake_news                    0.832005
N                            0.832005
C                            0.832005
Verified_status_False        0.824738
Volume                       0.794032
NATR                         0.709473
TRANGE                       0.610869
AvgTrueRange                 0.596301
Name: B5_N_Dm, dtype: float64
In [1796]:
df_corr = df_weekly.corr()['Fake_news'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(golden_features_list), golden_features_list))
There are 27 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
C                            1.000000
Verified_status_False        0.999813
Volume                       0.915956
Verified_status_True         0.873760
TRANGE                       0.870380
B5_N_Dm                      0.832005
B5_C_Dm                      0.832005
Downward_momentum_created    0.832005
NATR                         0.819497
AvgTrueRange                 0.810274
MACD                         0.659968
Variance                     0.619638
MACDSIGNAL                   0.603745
Upperband                    0.601182
High                         0.600643
MOM                          0.592654
Open                         0.590841
ROCP                         0.585983
Close                        0.582813
APO                          0.579894
Low                          0.568294
Adj Close                    0.563709
PPO                          0.563217
OBV                          0.552334
Middleband                   0.545463
Name: Fake_news, dtype: float64
In [1797]:
df_corr = df_weekly.corr()['Downward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_C_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.906063
Fake_news                    0.832005
N                            0.832005
C                            0.832005
Verified_status_False        0.824738
Volume                       0.794032
NATR                         0.709473
TRANGE                       0.610869
AvgTrueRange                 0.596301
Name: Downward_momentum_created, dtype: float64
In [1798]:
df_corr = df_weekly.corr()['Upward_momentum_created'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [1799]:
df_corr = df_weekly.corr()['Verified_status_True'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(golden_features_list), golden_features_list))
There are 12 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.906063
B5_C_Dm                      0.906063
Downward_momentum_created    0.906063
Fake_news                    0.873760
N                            0.873760
C                            0.873760
Verified_status_False        0.864204
Volume                       0.854952
NATR                         0.695119
TRANGE                       0.676983
AvgTrueRange                 0.589283
Name: Verified_status_True, dtype: float64
In [1800]:
df_corr = df_weekly.corr()['Verified_status_False'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(golden_features_list), golden_features_list))
There are 27 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999813
N                            0.999813
C                            0.999813
Volume                       0.913616
TRANGE                       0.873534
Verified_status_True         0.864204
B5_N_Dm                      0.824738
B5_C_Dm                      0.824738
Downward_momentum_created    0.824738
NATR                         0.820175
AvgTrueRange                 0.814836
MACD                         0.667845
Variance                     0.627763
MACDSIGNAL                   0.611341
Upperband                    0.609379
High                         0.608804
Open                         0.598892
MOM                          0.598256
Close                        0.591182
ROCP                         0.590005
APO                          0.586511
Low                          0.576631
Adj Close                    0.572033
PPO                          0.568813
OBV                          0.557379
Middleband                   0.553511
Name: Verified_status_False, dtype: float64
In [1801]:
sns.set(font_scale=0.8)
In [1802]:
for i in range(0, len(df_weekly.columns), 5):
    sns.pairplot(data=df_weekly,
                x_vars=df_weekly.columns[i:i+5],
                y_vars=['NATR'])
In [1803]:
df_weekly.fillna(0, inplace = True)
In [1804]:
df_weekly.dropna(inplace=True)
In [1805]:
corr = df_weekly.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [1806]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Weekly Volatility Distribution')
    
    plt.show();